Load all relevant R libraries.
# Load Libraries
library("RPostgres")
library("ggplot2")
library("tidyverse")
library("dbplyr")
library("lubridate")
library("data.table")
library("dtplyr")
library("ggpubr")
library("muStat")
library("mvnormtest")
library("knitr")
library("kableExtra")
# load the PostgreSQL driver
drv <- dbDriver("Postgres")
# create a connection to the postgres database
# set the search path to the mimiciii schema
con <- dbConnect(drv, dbname = "mimic",
host = "localhost", port = 5432,
user = "mousaghannam")
dbSendQuery(con, 'set search_path to mimiciii')
<PqResult>
SQL set search_path to mimiciii
ROWS Fetched: 0 [complete]
Changed: 0
# show a list of tables
dbListTables(con)
Closing open result set, cancelling previous query
[1] "chartevents_8" "admissions" "callout" "caregivers" "chartevents_1" "chartevents_2"
[7] "chartevents_3" "chartevents_4" "chartevents_5" "chartevents_6" "chartevents_7" "chartevents_9"
[13] "chartevents_10" "chartevents_11" "icustays" "inputevents_cv" "inputevents_mv" "labevents"
[19] "microbiologyevents" "chartevents_12" "chartevents_13" "chartevents_14" "chartevents" "chartevents_15"
[25] "chartevents_16" "chartevents_17" "cptevents" "datetimeevents" "diagnoses_icd" "drgcodes"
[31] "d_cpt" "d_icd_diagnoses" "prescriptions" "services" "d_icd_procedures" "d_items"
[37] "d_labitems" "noteevents" "outputevents" "patients" "procedureevents_mv" "procedures_icd"
[43] "transfers"
This is a memory conservative approaching for performing SQL queries on DB. We can pull tables in memory and do certain analyses on them, without actually having to save a given object into the R environment.
#Memory conserative approaching for performing SQL queries on DB
#(Rewrite this function to handle multiple inputs )
tbl_mimic <- function(table) {
table <- as.character(substitute(table))
tbl(con, dbplyr::in_schema("mimiciii", table))
}
This “dictionary” will contain all of the diagnoses events in the database associated with a given hospital admission id (hadm_id), along with a description of the icd9 code.
#Pull dictionary for diagnoses
d_icd_diagnoses <- tbl_mimic(d_icd_diagnoses) %>%
select(icd9_code, short_title)
diagnoses_icd <-tbl_mimic(diagnoses_icd) %>%
select(hadm_id, icd9_code, seq_num)
#ICD-9 Codes + their descriptions
icd9_dict <- diagnoses_icd %>%
inner_join(d_icd_diagnoses, by = "icd9_code")
icd9_dict
Saved all of the lab_ids into R environment with collect(), for filtering in the next step.
d_labitems <- tbl_mimic(d_labitems) #Pull dictionary for lab items into memory
d_labitems %>%
filter(str_detect(tolower(label), "potassium")) %>%
collect() -> lab_idsK
lab_idsK
d_labitems %>%
filter(str_detect(tolower(label), "magnesium")) %>%
collect() -> lab_idsMg
lab_idsMg
NA
d_labitems %>%
filter(str_detect(tolower(label), "calcium")) %>%
collect() -> lab_idsCa
lab_idsCa
NA
d_labitems %>%
filter(str_detect(tolower(label), "phosphate")) %>%
collect() -> lab_idsPh
lab_idsPh
NA
We are choosing to include the 1st and 4th row, which are strictly blood lab value draws for potassium. Do the same for the other electrolytes.
#Drugs to include for potassium
itemidsK <- lab_idsK$itemid[c(1,4)]
#Drugs to include for magnesium
itemidsMg <- lab_idsMg$itemid[c(1)]
#Drugs to include for Calcium
itemidsCa <- lab_idsCa$itemid[c(4)]
#Drugs to include for Phosphate
itemidsPh <- lab_idsPh$itemid[c(2)]
We are now taking the table labevents, which contains all the lab events in the database, and selecting the rows associated with the two blood potassium lab values itemids that we chose above.
labevents <- tbl_mimic(labevents)
labevents %>%
filter(itemid %in% itemidsK) %>%
inner_join(select(tbl_mimic(d_labitems),-loinc_code, -row_id ), by = "itemid") %>%
select(subject_id, hadm_id, itemid, charttime, valuenum, valueuom, label, flag, fluid, category) -> labEventsK
labEventsK
labevents %>%
filter(itemid %in% itemidsMg) %>%
inner_join(select(tbl_mimic(d_labitems),-loinc_code, -row_id ), by = "itemid") %>%
select(subject_id, hadm_id, itemid, charttime, valuenum, valueuom, label, flag, fluid, category) -> labEventsMg
labEventsMg
labevents %>%
filter(itemid %in% itemidsCa) %>%
inner_join(select(tbl_mimic(d_labitems),-loinc_code, -row_id ), by = "itemid") %>%
select(subject_id, hadm_id, itemid, charttime, valuenum, valueuom, label, flag, fluid, category) -> labEventsCa
labEventsCa
labevents %>%
filter(itemid %in% itemidsPh) %>%
inner_join(select(tbl_mimic(d_labitems),-loinc_code, -row_id ), by = "itemid") %>%
select(subject_id, hadm_id, itemid, charttime, valuenum, valueuom, label, flag, fluid, category) -> labEventsPh
labEventsPh
# print(paste(" There are ",pull(numLabKevents[1,1]), " total potassium lab values that were extracted from the labevents table." ))
The d_items table in mimics contains a dictionary for all of the events associated with both the metaview and careview database. We are only going to focus on the metaview database at this point. We then want to filter this table by the label description for items that are related to potassium.
d_items <- tbl_mimic(d_items) #Pull dictionary for all lab events into memory
d_items %>%
filter(label %ilike% "%potassium%" | label %ilike% "%kcl%") %>%
filter(str_detect(tolower(dbsource), "meta")) %>%
arrange(label) -> k_items_MV
k_items_MV
NA
Although there are events sourced from either metaview or careview, these two hospital databases are actually linked to 3 different tables in the mimics database: inputevents_mv, inputevents_cv or chartevents. Since we are examining only the metavision database, it will only link to inputevents_mv and chartevents. We would like to select for only the events that link to inputevents_mv.
k_items_MV %>%
filter(linksto=="inputevents_mv") %>% collect() -> k_items_MV_IM
k_items_MV_IM
If you look at the above table, the first row has a KCL (Bolus) with unit mL, while the 5th row has Potassium Chloride with units in mEq. According to the mimics iii website:
“Metavision records IO data using two tables: RANGESIGNALS and ORDERENTRY. These tables do not appear in MIMIC-III as they have been merged to form the > INPUTEVENTS_MV table. RANGESIGNALS contains recorded data elements which last for a fixed period of time. Furthermore, the RANGESIGNALS table recorded > information for each component of the drug separately. For example, for a norepinephrine administration there would be two components: a main order > component (norepinephrine) and a solution component (NaCl).” 1
Therefore, we will only be selecting repletions based off of the “main order component”, and ignoring the solution. We will select for the 4th and 5th rows, which correspond to Potassiume Acetate and Potassium Chloride.
k_items_MV_IM %>% slice(-6,-3, -2,-1) -> k_items_MV_IM #ONLY KEEPING THE REPLETIONS, THE ADDITIVE AMOUNTS, NOT THE BOLUS
k_items_MV_IM
k_items_MV_IM %>% pull(itemid) -> k_items_MV_IM_vector
d_items <- tbl_mimic(d_items) #Pull dictionary for all lab events into memory
d_items %>%
filter(label %ilike% "%magnesium%" | label %ilike% "%mg%") %>%
filter(str_detect(tolower(dbsource), "meta")) %>%
arrange(label) -> mg_items_MV
mg_items_MV %>%
filter(linksto=="inputevents_mv") %>% collect() -> mg_items_MV_IM
mg_items_MV_IM
mg_items_MV_IM %>% slice(1) -> mg_items_MV_IM #ONLY KEEPING THE REPLETIONS, THE ADDITIVE AMOUNTS, NOT THE BOLUS
mg_items_MV_IM
mg_items_MV_IM %>% pull(itemid) -> mg_items_MV_IM_vector
d_items %>%
filter(label %ilike% "%calcium%" ) %>%
filter(str_detect(tolower(dbsource), "meta")) %>%
arrange(label) -> ca_items_MV
ca_items_MV
ca_items_MV %>%
filter(linksto=="inputevents_mv") %>% collect() -> ca_items_MV_IM
ca_items_MV_IM
ca_items_MV_IM %>% slice(1) -> ca_items_MV_IM #ONLY KEEPING THE REPLETIONS, THE ADDITIVE AMOUNTS, NOT THE BOLUS
ca_items_MV_IM
ca_items_MV_IM %>% pull(itemid) -> ca_items_MV_IM_vector
d_items %>%
filter(label %ilike% "%phosphate%" | label %ilike% "na%") %>%
filter(str_detect(tolower(dbsource), "meta")) %>%
arrange(label) -> ph_items_MV
ph_items_MV
ph_items_MV %>%
filter(linksto=="inputevents_mv") %>% collect() -> ph_items_MV_IM
ph_items_MV_IM
ph_items_MV_IM %>% slice(1,8) -> ph_items_MV_IM #ONLY KEEPING THE REPLETIONS, THE ADDITIVE AMOUNTS, NOT THE BOLUS
ph_items_MV_IM
ph_items_MV_IM %>% pull(itemid) -> ph_items_MV_IM_vector
We will now use the filtered ids for the two additive solutions above to pull electrolyte repletions from the inputevents_mv table.
inputevents_mv <- tbl_mimic(inputevents_mv)
inputevents_mv %>%
filter(itemid %in% k_items_MV_IM_vector) %>%
select(subject_id, hadm_id,icustay_id, linkorderid, orderid, itemid, starttime, endtime, amount, amountuom, rate, rateuom, statusdescription, ordercomponenttypedescription) %>%
filter(!statusdescription == "Rewritten") %>%
rename(itemid.repletion = itemid) -> repEventsK
repEventsK
Determine the amount of potassium repletions:
repEventsK %>% count() %>% collect() -> numKRepletions
print(paste(" There are ",pull(numKRepletions[1,1]), " total potassium repletions that were extracted from the inputevents_mv table." ))
[1] " There are 66429 total potassium repletions that were extracted from the inputevents_mv table."
inputevents_mv %>%
filter(itemid %in% mg_items_MV_IM_vector) %>%
select(subject_id, hadm_id,icustay_id, linkorderid, orderid, itemid, starttime, endtime, amount, amountuom, rate, rateuom, statusdescription, ordercomponenttypedescription) %>%
filter(!statusdescription == "Rewritten") %>%
rename(itemid.repletion = itemid) -> repEventsMg
repEventsMg
inputevents_mv %>%
filter(itemid %in% ca_items_MV_IM_vector) %>%
select(subject_id, hadm_id,icustay_id, linkorderid, orderid, itemid, starttime, endtime, amount, amountuom, rate, rateuom, statusdescription, ordercomponenttypedescription) %>%
filter(!statusdescription == "Rewritten") %>%
rename(itemid.repletion = itemid) -> repEventsCa
repEventsCa
inputevents_mv %>%
filter(itemid %in% ph_items_MV_IM_vector) %>%
select(subject_id, hadm_id,icustay_id, linkorderid, orderid, itemid, starttime, endtime, amount, amountuom, rate, rateuom, statusdescription, ordercomponenttypedescription) %>%
filter(!statusdescription == "Rewritten") %>%
rename(itemid.repletion = itemid) -> repEventsPh
repEventsPh
Both the tables containing the lab draws and the repletion events have been extracted. We can now join them to begin analyzing the relationship between them.
In addition to joining the tables, we have renamed the column corresponding to the moment the fluid collection for the lab value was recorded, charttime, as charttime.lab for clarity.
We want to flag all repletion events that occurred either 24 hours before or 24 hours after a given lab value. We then filter the table, storing all of the repletions occurring BEFORE a given lab value in one table, and all of the ones occurring AFTER a given lab value in another.
We are going to do analysis here on the dataset containing repletions. We are making this distinction, as later on, we will do analysis on non-repletions.
In this table we are just doing some summary analysis on the dataset.
postExclusionsK_Repleted %>%
group_by( preVsPost) %>%
summarize(n = n(), mean = mean(valuenum, na.rm = TRUE), standard_deviation = sd(valuenum, na.rm = T))
We have a table that contains all of the repletion events that occur either 24 hours prior or after a given lab value. We then group the rows by each subject and hospital id, as well as the time of fluid acquisition for the lab value. Then, we create a column that generates a flag if the lab value that is chosen is the minimum starttime for all the possible starttimes. Because there are multiple lab values, as well as multiple repletion events, for a given hadm_id the table outputted has all the possible permutations. We want to select the most recent repletion AFTER a lab value, so we choose the repletion with the starttime equal to the minimum starttime for a given lab value.
#Potassium lab event PRE-repletions
allRepLabEvents_k %>%
filter(isRecentPre) %>%
group_by(subject_id, hadm_id,charttime.lab) %>%
mutate(isMostRecentRepletion = starttime == min(starttime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, starttime) %>%
mutate(isMostRecentLabEvent = charttime.lab == max(charttime.lab)) %>%
filter(isMostRecentLabEvent) %>% distinct() -> pre_k_lab_repletions_MV_new
pre_k_lab_repletions_MV_new
For the post-repletion lab values, we are now doing the same thing, but instead finding the maximum endtime prior to a given lab value, to select the most recent repletion prior to a given lab value.
#Potassium lab event POST-repletions
allRepLabEvents_k %>%
filter(isRecentPost) %>%
group_by(subject_id, hadm_id,charttime.lab) %>%
mutate(isMostRecentRepletion = endtime == max(endtime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, endtime) %>%
mutate(isMostRecentLabEvent = charttime.lab == min(charttime.lab)) %>%
filter(isMostRecentLabEvent) %>% distinct() -> post_k_lab_repletions_MV_new
post_k_lab_repletions_MV_new
Doing it on the combined dataset
allEs_pre_post_repletions_MV_new %>%
select(linkorderid, Electrolyte) %>%
distinct()%>%
group_by(Electrolyte) %>%
summarize(n = n())
Adding missing grouping variables: `subject_id`, `hadm_id`, `starttime`
`summarise()` ungrouping output (override with `.groups` argument)
We then combine all of the values from the “pre-repletion” and “post-repletion” tables, into one table. We create a new column preVsPost to indicate if a given event is either before or after a lab value. Conceivably, the same repletion could correspond to a
#Combine them into one dataset
k_pre_post_repletions_MV_new <- bind_rows(list(pre_repletion = pre_k_lab_repletions_MV_new, post_repletion = post_k_lab_repletions_MV_new), .id = "preVsPost")
k_pre_post_repletions_MV_new %>% distinct()
#This gives the amount of repletions events with the same linkorder id. Anything over 2 might need to be exlcuded, as this should have been broken up. It reflects the inherent error in EHRs it seems like.
k_pre_post_repletions_MV_new %>%
group_by(linkorderid) %>%
summarize(observations = n()) %>%
arrange(desc(observations)) %>%
count(observations)
`summarise()` ungrouping output (override with `.groups` argument)
#Filters all of the rows where there is a post and a pre-repletion value (hopefully lol)
k_pre_post_repletions_MV_new %>%
group_by(linkorderid) %>%
filter(n() == 1) %>%
ungroup()%>%
group_by(preVsPost) %>%
summarize( mean = mean(valuenum, na.rm = T), obs = n(), sd = sd(valuenum, na.rm = T))
`summarise()` ungrouping output (override with `.groups` argument)
k_pre_post_repletions_MV_new %>%
group_by(linkorderid) %>%
filter(n() == 2) %>%
ungroup()%>%
group_by(preVsPost) %>%
summarize( mean = mean(valuenum, na.rm = T), obs = n(), sd = sd(valuenum, na.rm = T))
`summarise()` ungrouping output (override with `.groups` argument)
#This gives the frequency for the amount of unique values
k_pre_post_repletions_MV_new %>%
group_by(charttime.lab) %>%
summarize(observations = n()) %>%
arrange(desc(observations)) %>%
count(observations)
`summarise()` ungrouping output (override with `.groups` argument)
allRepLabEvents_mg %>%
filter(isRecentPre) %>%
group_by(subject_id, hadm_id,charttime.lab) %>%
mutate(isMostRecentRepletion = starttime == min(starttime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, starttime) %>%
mutate(isMostRecentLabEvent = charttime.lab == max(charttime.lab)) %>%
filter(isMostRecentLabEvent) %>% distinct() -> pre_mg_lab_repletions_MV_new
pre_mg_lab_repletions_MV_new
#Magnesium lab event POST-repletions
allRepLabEvents_mg %>%
filter(isRecentPost) %>%
group_by(subject_id, hadm_id,charttime.lab) %>%
mutate(isMostRecentRepletion = endtime == max(endtime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, endtime) %>%
mutate(isMostRecentLabEvent = charttime.lab == min(charttime.lab)) %>%
filter(isMostRecentLabEvent) %>% distinct() -> post_mg_lab_repletions_MV_new
post_mg_lab_repletions_MV_new
#Combine them into one dataset
mg_pre_post_repletions_MV_new <- bind_rows(list(pre_repletion = pre_mg_lab_repletions_MV_new, post_repletion = post_mg_lab_repletions_MV_new), .id = "preVsPost")
mg_pre_post_repletions_MV_new %>% distinct()
#Calcium lab event PRE-repletions
allRepLabEvents_ca %>%
filter(isRecentPre) %>%
group_by(subject_id, hadm_id,charttime.lab) %>%
mutate(isMostRecentRepletion = starttime == min(starttime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, starttime) %>%
mutate(isMostRecentLabEvent = charttime.lab == max(charttime.lab)) %>%
filter(isMostRecentLabEvent) %>% distinct() -> pre_ca_lab_repletions_MV_new
pre_ca_lab_repletions_MV_new
#Calcium lab event POST-repletions
allRepLabEvents_ca %>%
filter(isRecentPost) %>%
group_by(subject_id, hadm_id,charttime.lab) %>%
mutate(isMostRecentRepletion = endtime == max(endtime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, endtime) %>%
mutate(isMostRecentLabEvent = charttime.lab == min(charttime.lab)) %>%
filter(isMostRecentLabEvent) %>% distinct() -> post_ca_lab_repletions_MV_new
post_ca_lab_repletions_MV_new
#Combine them into one dataset
ca_pre_post_repletions_MV_new <- bind_rows(list(pre_repletion = pre_ca_lab_repletions_MV_new, post_repletion = post_ca_lab_repletions_MV_new), .id = "preVsPost")
ca_pre_post_repletions_MV_new %>% distinct()
NA
#Phosphate lab event PRE-repletions
allRepLabEvents_ph %>%
filter(isRecentPre) %>%
group_by(subject_id, hadm_id,charttime.lab) %>%
mutate(isMostRecentRepletion = starttime == min(starttime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, starttime) %>%
mutate(isMostRecentLabEvent = charttime.lab == max(charttime.lab)) %>%
filter(isMostRecentLabEvent) %>% distinct() -> pre_ph_lab_repletions_MV_new
pre_ph_lab_repletions_MV_new
#Phosphate lab event POST-repletions
allRepLabEvents_ph %>%
filter(isRecentPost) %>%
group_by(subject_id, hadm_id,charttime.lab) %>%
mutate(isMostRecentRepletion = endtime == max(endtime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, endtime) %>%
mutate(isMostRecentLabEvent = charttime.lab == min(charttime.lab)) %>%
filter(isMostRecentLabEvent) %>% distinct() -> post_ph_lab_repletions_MV_new
post_ph_lab_repletions_MV_new
#Combine them into one dataset
ph_pre_post_repletions_MV_new <- bind_rows(list(pre_repletion = pre_ph_lab_repletions_MV_new, post_repletion = post_ph_lab_repletions_MV_new), .id = "preVsPost")
ph_pre_post_repletions_MV_new %>% distinct()
NA
#Combine all datasets into one We’re going to make one giant dataset across all of the electrolytes.
all_lytes_pre_post_repletions_MV_new %>%
select(linkorderid, Electrolyte) %>%
distinct()%>%
group_by(Electrolyte) %>%
summarize(n = n())
Adding missing grouping variables: `subject_id`, `hadm_id`, `starttime`
`summarise()` ungrouping output (override with `.groups` argument)
We now want to exclude all the rows based on certain criteria.
We have a table called hadm_id_table which contains all of the hadm_ids of patients with diagnoses that would confound our results (kidney disease, etc). (WILL ADD HOW I CREATED THIS TABLE LATER).
hadm_id_table
First bubble of schemtic
#The amount of distinct repletions across each electrolyte
all_repletions_MV_new_AEs %>%
ungroup()%>%
select(linkorderid, Electrolyte) %>%
distinct() %>%
group_by(Electrolyte) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
kable()
`summarise()` ungrouping output (override with `.groups` argument)
| Electrolyte | n |
|---|---|
| potassium | 63365 |
| magnesium | 24601 |
| calcium | 20206 |
| phosphate | 1339 |
Third bubble of schematic: Those that have been excluded pre and post 24 hrs, and also the most recent lab value before and after each repletion
allEs_pre_post_repletions_MV_new %>% #Amount of DISTINCT Repletion Events left
ungroup()%>%
anti_join(hadm_id_table, by = "hadm_id") %>%
select(linkorderid, Electrolyte) %>%
distinct() %>%
group_by(Electrolyte) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
kable()
`summarise()` ungrouping output (override with `.groups` argument)
| Electrolyte | n |
|---|---|
| potassium | 17755 |
| magnesium | 8008 |
| calcium | 4055 |
| phosphate | 348 |
As a result, we could simply do a semi-join to determine how many people in each group were excluded. Note, we are doing the anti-join on the table with the most recent repletion before and after a given lab result, within a 24 hour time range. We are not doing it on the set of ALL possible repletion and or lab result events.
hadm_id_table %>%
semi_join(all_lytes_pre_post_repletions_MV_new, by = "hadm_id") %>%
select(hadm_id, .id) %>%
group_by(.id) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
kable()
## Number of hospital visits for potassium
hadm_id_table %>%
# semi_join(k_pre_post_repletions_MV_new, by = "hadm_id") %>%
inner_join(k_pre_post_repletions_MV_new, by = "hadm_id") %>%
select(hadm_id, .id) %>%
group_by(.id) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
kable()
k_pre_post_repletions_MV_new %>%
ungroup()%>%
semi_join(hadm_id_table, by = "hadm_id") %>%
select(icustay_id, orderid, .id) %>%
group_by(.id) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
kable()
And here is the resultant dataset after actually excluding this time, using an anti-join instead.
k_pre_post_repletions_MV_new %>%
anti_join(hadm_id_table, by = "hadm_id") -> postExclusionsK_Repleted
postExclusionsK_Repleted
###Another Exclusions
k_pre_post_repletions_MV_new %>%
semi_join(hadm_id_table, by = "hadm_id")
hadm_id_table %>%
semi_join(k_pre_post_repletions_MV_new, by = "hadm_id")
We will now determine the set of lab values for which there was no immediate repletion afterward.
In this section, we have started by filtering all the repletions prior and after a lab value, just like for the analysis on the repletions. However, this time, if there are multiple lab values that occur before a given repletion (or after), we are taking the ones after the first one.
#All electrolyte lab event PRE-repletions
all_electrolyte_repLabEvents %>%
filter(isRecentPre) %>%
group_by(subject_id, hadm_id,charttime.lab, Electrolyte) %>%
mutate(isMostRecentRepletion = starttime == min(starttime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, starttime, Electrolyte) %>%
mutate(isMostRecentLabEvent = charttime.lab == max(charttime.lab)) %>%
filter(!isMostRecentLabEvent) %>% distinct() -> nonRepletLabsPre_allEs
#All electrolyte lab event POST-repletions
all_electrolyte_repLabEvents %>%
filter(isRecentPost) %>%
group_by(subject_id, hadm_id,charttime.lab, Electrolyte) %>%
mutate(isMostRecentRepletion = endtime == max(endtime)) %>%
filter(isMostRecentRepletion) %>%
ungroup() %>%
group_by(subject_id, hadm_id, endtime, Electrolyte) %>%
mutate(isMostRecentLabEvent = charttime.lab == min(charttime.lab)) %>%
filter(!isMostRecentLabEvent) %>% distinct() -> nonRepletLabsPost_allEs
allNonRepletions_allEs <- bind_rows(list(pre_repletion = nonRepletLabsPre_allEs, post_repletion = nonRepletLabsPost_allEs), .id = "preVsPost")
allNonRepletions_allEs %>%
group_by(preVsPost, Electrolyte) %>%
summarize(n = n(), mean = mean(valuenum, na.rm = TRUE), standard_deviation = sd(valuenum, na.rm = T))
`summarise()` regrouping output by 'preVsPost' (override with `.groups` argument)
We now want to exclude all the rows based on certain criteria.
We have a table called hadm_id_table which contains all of the hadm_ids of patients with diagnoses that would confound our results (kidney disease, etc). (WILL ADD HOW I CREATED THIS TABLE LATER).
hadm_id_table
As a result, we could simply do a semi-join to determine how many people in each group were excluded. Note, we are doing the anti-join on the table with the most recent repletion before and after a given lab result, within a 24 hour time range. We are not doing it on the set of ALL possible repletion and or lab result events.
hadm_id_table %>%
semi_join(all_lytes_pre_post_repletions_MV_new, by = "hadm_id") %>%
select(hadm_id, .id) %>%
group_by(.id) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
kable()
`summarise()` ungrouping output (override with `.groups` argument)
| .id | n |
|---|---|
| renal-codes | 8127 |
| afib-codes | 4013 |
| aki-codes | 3914 |
| chf-codes | 3332 |
| cad-codes | 1262 |
| esrd-codes | 735 |
| prbc | 613 |
| ckd-codes | 535 |
| dialysis-codes | 219 |
| rhabdo-codes | 180 |
| paralyzed-codes | 164 |
| parathyroid-codes | 66 |
| sarcoid-codes | 58 |
| pediatrics | 20 |
| burn-codes | 15 |
| nutrit-def | 7 |
And here is the resultant dataset after actually excluding this time, using an anti-join instead.
k_pre_post_repletions_MV_new %>%
anti_join(hadm_id_table, by = "hadm_id") -> postExclusionsK_Repleted
postExclusionsK_Repleted
###Another Exclusions
k_pre_post_repletions_MV_new %>%
semi_join(hadm_id_table, by = "hadm_id")
hadm_id_table %>%
semi_join(k_pre_post_repletions_MV_new, by = "hadm_id")
We are going to do analysis here on the dataset containing repletions. We are making this distinction, as later on, we will do analysis on non-repletions.
In this table we are just doing some summary analysis on the dataset.
postExclusionsK_Repleted %>%
group_by( preVsPost) %>%
summarize(n = n(), mean = mean(valuenum, na.rm = TRUE), standard_deviation = sd(valuenum, na.rm = T))
Histogram of Pre and Post Repletions
ggpar(gghistogram(data = postExclusionsK_Repleted, x="valuenum", fill = "preVsPost", add="mean", palette = c("#00AFBB", "#E7B800"),add_density = FALSE, bins = 40, gggtheme = theme_pubr(), xlab = "Lab Value", title = "Pre Vs Post Repletion Lab Values - Potassium", ylab = "Repletions")
, xlim = c(2,6))
ggplot(postExclusionsK_Repleted, aes(x = valuenum,fill=preVsPost)) +
geom_histogram(alpha=0.6, position="identity",bins=40, color="black") + scale_x_continuous(limits = c(2,6) ) + theme_pubr() + xlab("Potassium Value") + ylab("Number of Occurrences") + scale_fill_discrete(name = "", labels = c("Post repletion K", "Pre repletion K")) -> h1_k
h1_k
postExclusionsK_Repleted %>%
mutate(repletionRange = case_when(valuenum < 3.5 & flag == "abnormal" ~ "below normal",
valuenum > 3.5 & flag == "abnormal" ~ "above range",TRUE ~ "within range")) -> postExclusionsK_Repleted
postExclusionsK_Repleted %>%
filter(repletionRange == "above range") %>%
group_by(valuenum) %>%
summarize(observations = n()) %>%
arrange(valuenum)
postExclusion_freq <- postExclusionsK_Repleted %>%
group_by(repletionRange) %>%
summarize(observations = n()) %>%
mutate(percentage = observations / sum(observations) * 100)
ggplot(postExclusion_freq, aes("", percentage, fill = repletionRange)) +
geom_bar(stat = "identity", color = "white", size = 1) +
coord_polar(theta = "y") +
scale_fill_manual(values =c( "#B3B3B3","#27408B","#EE7621"),name = " ",
labels = c( "Above 5.2","Below 3.4", "3.4-5.2" )
) +
theme_void() + theme(legend.position="top") -> pie_chart_k
pie_chart_k
allNonRepletions <- bind_rows(list(pre_repletion = nonRepletLabsPre, post_repletion = nonRepletLabsPost), .id = "preVsPost")
#Run Exclusions
allNonRepletions %>%
anti_join(hadm_id_table, by = "hadm_id") -> postexclusion_nonRepletions
postexclusion_nonRepletions
postexclusion_nonRepletions %>%
group_by( preVsPost) %>%
summarize(n = n(), mean = mean(valuenum, na.rm = TRUE), standard_deviation = sd(valuenum, na.rm = T))
repletedVsNonRepleted_PreV <- bind_rows(list(repleted = postExclusionsK_Repleted, nonRepleted = postexclusion_nonRepletions), .id = "repleteVsNon") %>% filter(preVsPost == "pre_repletion")
repletedVsNonRepleted_PreV %>% distinct()
#Table
repletedVsNonRepleted_PreV %>%
group_by(repleteVsNon,flag) %>%
summarize(observations = n(),mean = mean(valuenum, na.rm = T), std_dev = sd(valuenum, na.rm=T)) %>%
mutate(percentage = observations / sum(observations) * 100)
#Visualizing the non-repleted vs repleted lab values
ggpar(gghistogram(data = repletedVsNonRepleted_PreV, x="valuenum", fill = "flag", add="mean",position = "identity", palette = c("#00AFBB", "#E7B800"),add_density = FALSE, bins = 100, gggtheme = theme_pubr(), xlab = "Lab Value", title = "Pre Vs Post Repletion Lab Values - Potassium", ylab = "Repletions", facet.by = "repleteVsNon")
, xlim = c(2,6))
Question: How can there be histogram bars that are “abnormal” on top of bars that are white? Are they just not granular enough to differentiate them? I think there are errors in how the data was flagged. Should look more into this. #### Looking at the outliers
repletedVsNonRepleted_PreV %>%
filter(repleteVsNon == "repleted") %>%
filter(flag == "abnormal" & valuenum > 4) %>%
ungroup()%>%
summarize(observations = n(),mean = mean(valuenum, na.rm = T), std_dev = sd(valuenum, na.rm=T))
postexclusion_nonRepletions %>%
filter(flag == "abnormal" & valuenum < 4) %>%
ungroup() %>%
summarize(observations = n(),mean = mean(valuenum, na.rm = T), std_dev = sd(valuenum, na.rm=T))
postexclusion_nonRepletions %>%
filter(valuenum < 3.5 & flag == "abnormal") %>%
ggplot(aes(x = valuenum)) +
geom_histogram(alpha=0.6, position="identity",bins=14, color="black") + scale_x_continuous(limits = c(1,3.5)) + theme_pubr()
###Amount of people that replete (vs Non-Replete) across lab value
repletedVsNonRepleted_PreV %>%
group_by(valuenum, repleteVsNon) %>%
summarize(observations = n() ) %>%
ggplot(aes(fill=repleteVsNon, y=observations, x=valuenum)) +
geom_bar(position="fill", stat="identity") + scale_x_continuous() + theme_pubr()
Examining the time of day for each repletion and lab value.
repletedVsNonRepleted_PreV %>%
mutate(RepletionHour = hour(starttime), LabHour = hour(charttime.lab)) %>%
gghistogram(x = c("LabHour","RepletionHour"), bins = 24, palette = "uchicago", alpha = 0.5, merge = T, xlab = "Hours", ylab = "Number of Occurences")
repletedVsNonRepleted_PreV %>%
ungroup() %>%
mutate(LabHour = hour(charttime.lab) + 4, RepletionHour = hour(starttime)) %>%
pivot_longer(cols = c(LabHour,RepletionHour), values_to="Hour", names_to="WhichHour") -> yy
ordered(yy$WhichHour)
yy$WhichHour <- factor(yy$WhichHour, ordered = FALSE)
yy$WhichHour <- relevel(yy$WhichHour,"RepletionHour")
ggplot(yy, aes(x = Hour,fill=WhichHour)) + geom_histogram(position="identity",bins = 24,alpha=0.6) +
scale_x_continuous(breaks=seq(0,24,2)) + theme_pubr()
ggplot(yy, aes(x = Hour,fill=WhichHour)) +
geom_histogram(alpha=0.4, position="identity",bins=24, color="black") + scale_x_continuous(breaks=seq(0,24,2)) + facet_wrap(vars(repleteVsNon))
Below, we have the time of day for lab value draws that occur for lab values that were repleted, and those that did not directly lead to a repletion.
It seems that those leading to a repltion were more likely to occur in the morning, while those that did not directly lead to a repletion were more likely to occur in the afternoon.
Why is this occurring? Perhaps, repletions in the morning are routine, but there is no routine in performing repletions in the afternoon.
repletedVsNonRepleted_PreV %>%
ungroup() %>%
mutate(LabHour = hour(charttime.lab)) %>%
filter(preVsPost == "pre_repletion") %>%
ggplot(aes(x = LabHour,fill=repleteVsNon)) + geom_histogram(position="identity",bins = 24,alpha=0.6, color = "black") + scale_x_continuous(breaks=seq(0,24,2)) + theme_pubr() + xlab("Hour of Day") + ylab("Number of Lab Values") + scale_fill_discrete(name = "", labels = c("Non-Repletion", "Repletion"))
repletedVsNonRepleted_PreV %>%
ungroup() %>%
mutate(LabHour = hour(charttime.lab)) %>%
filter(preVsPost == "pre_repletion") %>%
group_by(LabHour, repleteVsNon) %>%
summarize(mean_lab_value = mean(valuenum, na.rm = T)) %>%
pivot_wider(names_from = repleteVsNon, values_from = mean_lab_value)
repletedVsNonRepleted_PreV %>%
ungroup() %>%
mutate(LabHour = hour(charttime.lab)) %>%
filter(preVsPost == "pre_repletion") %>%
group_by(LabHour, repleteVsNon) %>%
summarize(mean_lab_value = mean(valuenum, na.rm = T)) %>%
ggbarplot(x="LabHour", y = "mean_lab_value", fill = "repleteVsNon", facet.by = "repleteVsNon" ) +
scale_x_continuous(breaks=seq(0,24,2)) + theme_pubr()
repletedVsNonRepleted_PreV %>%
ungroup() %>%
mutate(LabHour = hour(charttime.lab)) %>%
filter(preVsPost == "pre_repletion") %>%
group_by(LabHour, repleteVsNon) %>%
summarize(mean_lab_value = mean(valuenum, na.rm = T), std_dev = sd(valuenum, na.rm = T)) %>%
ggplot(aes(x=LabHour,y=mean_lab_value,group=repleteVsNon, color = repleteVsNon)) + geom_line() + geom_point() + scale_x_continuous(breaks=seq(0,24,2)) + theme_pubr() + geom_errorbar(aes(ymin=mean_lab_value-std_dev, ymax=mean_lab_value+std_dev), width=.2,
position=position_dodge(0.05))
repletedVsNonRepleted_PreV %>%
ungroup() %>%
mutate(LabHour = hour(charttime.lab)) %>%
filter(preVsPost == "pre_repletion") %>%
group_by(LabHour, repleteVsNon) %>%
summarize(mean_lab_value = mean(valuenum, na.rm = T)) %>%
ggplot(aes(x=LabHour,y=mean_lab_value,fill=repleteVsNon)) + geom_bar(stat="identity", position = "dodge", color = "black") + scale_x_continuous(breaks=seq(0,24,2)) + theme_pubr()